###############################################
#                                             #
#     National Survey Data Cleaning           #
#     Created by Mac Lockhart Nov 29 2022     #
#     Updated Dec 1 2022                      #
#                                             #
###############################################

#This code recodes variables to standard outcomes and adds survey weights to the data

# Machine-specific convenience: when the script runs under the author's user
# account, switch to the project folder so that the relative "Survey Data/..."
# paths used below resolve.
current_user_is_author <- Sys.info()[["user"]] == "maclockhart"
if (current_user_is_author) {
  setwd("/Users/maclockhart/Dropbox/School/RA Work/'22 Seth & Thad/MIT Evolving Election Administration Landscape")
}
rm(current_user_is_author)
library(tidyverse)
library(RColorBrewer)
library(car)
library(data.table)

# Read the raw survey export, discard its first two data rows (extra header
# rows in the export), and keep only respondents with a non-missing
# `Own State Post` answer so the analysis sample is restricted to completes.
DF <- read_csv("Survey Data/National.csv") %>%
  slice(-(1:2)) %>%
  filter(!is.na(`Own State Post`))

#Recode party ID, race, ethnicity, income, education, treatment status, and gender to categorical variables. 
# Attach human-readable labels to the coded demographic and treatment columns.
# Each lookup vector below maps a raw survey code (the name) to its label (the
# value) and is spliced into dplyr::recode() with `!!!`, which is equivalent to
# writing the code = label pairs out as individual arguments. The lookups live
# inside local() so they do not remain in the global environment.
DF <- local({
  party_key <- c(
    "1" = "Democrat",
    "2" = "Republican",
    "3" = "Independent",
    "4" = "Independent"
  )

  race_full_key <- c(
    "1" = "White",
    "2" = "Black, or African American",
    "3" = "American Indian or Alaska Native",
    "4" = "Asian *** Asian Indian",
    "5" = "Asian *** Chinese",
    "6" = "Asian *** Filipino",
    "7" = "Asian *** Japanese",
    "8" = "Asian *** Korean",
    "9" = "Asian *** Vietnamese",
    "10" = "Asian *** Other",
    "11" = "Pacific Islander *** Native Hawaiian",
    "12" = "Pacific Islander *** Guamanian",
    "13" = "Pacific Islander *** Samoan",
    "14" = "Pacific Islander *** Other Pacific Islander",
    "15" = "Some other race",
    "16" = "Prefer not to answer"
  )

  # Same codes as race_full_key, with the Asian (4-10) and Pacific Islander
  # (11-14) sub-groups collapsed into one label each.
  race_short_key <- c(
    "1" = "White",
    "2" = "Black, or African American",
    "3" = "American Indian or Alaska Native",
    setNames(rep("Asian", 7), 4:10),
    setNames(rep("Pacific Islander", 4), 11:14),
    "15" = "Some other race",
    "16" = "Prefer not to answer"
  )

  ethnicity_full_key <- c(
    "1" = "Not Hispanic",
    "2" = "Hispanic - Mexican, Mexican American, Chicano",
    "3" = "Hispanic - Cuban",
    "4" = "Hispanic - Argentina",
    "5" = "Hispanic - Colombia",
    "6" = "Hispanic - Ecuador",
    "7" = "Hispanic - El Salvadore",
    "8" = "Hispanic - Guatemala",
    "9" = "Hispanic - Nicaragua",
    "10" = "Hispanic - Panama",
    "11" = "Hispanic - Peru",
    "12" = "Hispanic - Spain",
    "13" = "Hispanic - Venezuela",
    "14" = "Hispanic - Other Country",
    "15" = "Prefer not to answer",
    "16" = "Hispanic - Puerto Rican"
  )

  # Codes 2-14 and 16 are all Hispanic origins; 15 is the refusal option.
  ethnicity_short_key <- c(
    "1" = "Not Hispanic",
    setNames(rep("Hispanic", 13), 2:14),
    "15" = "Prefer not to answer",
    "16" = "Hispanic"
  )

  income_key <- c(
    "1" = "Less than $14,999",
    "2" = "$15,000 to $19,999",
    "3" = "$20,000 to $24,999",
    "4" = "$25,000 to $29,999",
    "5" = "$30,000 to $34,999",
    "6" = "$35,000 to $39,999",
    "7" = "$40,000 to $44,999",
    "8" = "$45,000 to $49,999",
    "9" = "$50,000 to $54,999",
    "10" = "$55,000 to $59,999",
    "11" = "$60,000 to $64,999",
    "12" = "$65,000 to $69,999",
    "13" = "$70,000 to $74,999",
    "14" = "$75,000 to $79,999",
    "15" = "$80,000 to $84,999",
    "16" = "$85,000 to $89,999",
    "17" = "$90,000 to $94,999",
    "18" = "$95,000 to $99,999",
    "19" = "$100,000 to $124,999",
    "20" = "$125,000 to $149,999",
    "21" = "$150,000 to $174,999",
    "22" = "$175,000 to $199,999",
    "23" = "$200,000 to $249,999",
    "24" = "$250,000 and above",
    "-3105" = "Prefer not to answer"
  )

  education_key <- c(
    "1" = "Some high school or less",
    "2" = "High school graduate",
    "3" = "Other post high school vocational training",
    "4" = "Completed some college, but no degree",
    "5" = "Associate's degree",
    "6" = "Bachelor's degree",
    "7" = "Master's or professional degree",
    "8" = "Doctorate degree",
    "-3105" = "None of the above"
  )

  # Binary indicator: 0 for the control arm, 1 for either treatment arm.
  treated_key <- c(
    "NationalTreatment-Control" = 0,
    "NationalTreatment-Emotion" = 1,
    "NationalTreatment-Facts" = 1
  )

  arm_key <- c(
    "NationalTreatment-Control" = "Control",
    "NationalTreatment-Emotion" = "Emotions",
    "NationalTreatment-Facts" = "Facts"
  )

  gender_key <- c(
    "1" = "Male",
    "2" = "Female"
  )

  DF %>%
    mutate(
      party = dplyr::recode(political_party, !!!party_key),
      race_full = dplyr::recode(ethnicity, !!!race_full_key),
      race_short = dplyr::recode(ethnicity, !!!race_short_key),
      ethnicity_full = dplyr::recode(hispanic, !!!ethnicity_full_key),
      ethnicity_short = dplyr::recode(hispanic, !!!ethnicity_short_key),
      household_income = dplyr::recode(hhi, !!!income_key),
      education_full = dplyr::recode(education, !!!education_key),
      d_treatment = dplyr::recode(FL_25_DO, !!!treated_key),
      treatment = dplyr::recode(FL_25_DO, !!!arm_key),
      gender_text = dplyr::recode(gender, !!!gender_key)
    )
})

# Store age as a number; entries that cannot be parsed become NA (with a warning).
DF[["age"]] <- as.numeric(DF[["age"]])

#Separate multiple-response (select-all-that-apply) items into one TRUE/FALSE indicator per answer option

#Information sources
# Each multi-response column holds the text of every option the respondent
# selected. A flag is TRUE when the option's text is found in that column and
# FALSE otherwise (including when the column is missing for the respondent,
# because grepl() returns FALSE for NA input).
DF <- DF %>%
  mutate(
    # Where respondents get information (`Info Sources List`)
    info_youtube = grepl("YouTube", `Info Sources List`),
    info_tiktok = grepl("TikTok", `Info Sources List`),
    info_instagram = grepl("Instagram", `Info Sources List`),
    info_other_sm = grepl("Another social media network", `Info Sources List`),
    info_twitter = grepl("Twitter", `Info Sources List`),
    info_television = grepl("Television", `Info Sources List`),
    info_internet = grepl("Searching the internet", `Info Sources List`),
    info_facebook = grepl("Facebook", `Info Sources List`),
    info_radio = grepl("Radio", `Info Sources List`),
    info_news = grepl("Newspapers", `Info Sources List`),

    # Specific sources selected (`Information Sources`)
    sources_election_officials = grepl("Local and state elections officials", `Information Sources`),
    sources_television = grepl("Television news in my local area", `Information Sources`),
    sources_fox = grepl("Fox News", `Information Sources`),
    sources_cnn = grepl("CNN", `Information Sources`),
    sources_politicians = grepl("Political leaders in my party", `Information Sources`),

    # Voting issues selected (`Issues`)
    issues_hours = grepl("Hours available for in-person voting", Issues),
    issues_in_person_locations = grepl("Accessibility of in-person voting location", Issues),
    issues_assistance = grepl("Assistance of election workers", Issues),
    issues_language = grepl("Lack of voting materials in preferred languages", Issues),
    issues_electronic_machines = grepl("Ease or difficulty of using electronic voting machines", Issues),
    issues_finding_location = grepl("Locating in-person voting location address", Issues),
    issues_vote_by_mail = grepl("Ease or difficulty of voting by mail", Issues),
    issues_voting_information = grepl("Lack of voting information or materials", Issues),
    issues_not_confident_counted = grepl("Not confident ballot will be counted correctly", Issues),
    issues_long_line = grepl("A long line at the location where I voted", Issues),

    # Disabilities selected (`Disabilities`)
    disability_hearing = grepl("Hearing", Disabilities),
    disability_seeing = grepl("Seeing", Disabilities),
    disability_walking = grepl("Walking", Disabilities),
    disability_hands = grepl("Using your hands", Disabilities),
    disability_reading = grepl("Reading", Disabilities),
    disability_talking = grepl("Talking", Disabilities),
    disability_thinking = grepl("Thinking", Disabilities),
    disability_remembering = grepl("Remembering", Disabilities),
    disability_none = grepl("None", Disabilities)
  )

###Recode outcome variables per the pre-analysis plan
# Convert the text answers of the outcome questions to numeric scores.
# Pre- and post-treatment versions of a question share one scale, so each
# scale is defined once and spliced into dplyr::recode() with `!!!`.
# On every scale a higher score means more trust, less perceived fraud, or a
# higher stated likelihood of voting. Answers that match no key become NA.
DF <- local({
  trust_scale <- c(
    "Trust a lot" = 4,
    "Trust some" = 3,
    "Distrust some" = 2,
    "Distrust a lot" = 1
  )

  vote_fraud_scale <- c(
    "Vote fraud almost never occurs" = 5,
    "Vote fraud occurs infrequently" = 4,
    "Vote fraud occurs about half of the time" = 3,
    "Vote fraud is very common" = 2,
    "Vote fraud happens all of the time" = 1
  )

  official_fraud_scale <- c(
    "Fraud by official state or county election authorities almost never occurs" = 5,
    "Fraud by official state or county election authorities occurs infrequently" = 4,
    "Fraud by official state or county election authorities occurs about half of the time" = 3,
    "Fraud by official state or county election authorities is very common" = 2,
    "Fraud by official state or county election authorities happens all of the time" = 1
  )

  turnout_scale <- c(
    "Definitely will vote" = 5,
    "Probably will vote" = 4,
    "May or may not vote" = 3,
    "Probably will not vote" = 2,
    "Definitely will not vote" = 1
  )

  DF %>%
    mutate(
      ownstatepost = dplyr::recode(`Own State Post`, !!!trust_scale),
      ownstatepre = dplyr::recode(`Trust Own State`, !!!trust_scale),
      otherstatepost = dplyr::recode(`Other State Post`, !!!trust_scale),
      otherstatepre = dplyr::recode(`Trust Other States`, !!!trust_scale),
      votefraudpost = dplyr::recode(`Vote fraud post`, !!!vote_fraud_scale),
      votefraudpre = dplyr::recode(`Vote fraud`, !!!vote_fraud_scale),
      officialfraudpost = dplyr::recode(`Official fraud post`, !!!official_fraud_scale),
      officialfraudpre = dplyr::recode(`Official Fraud`, !!!official_fraud_scale),
      vote2024post = dplyr::recode(`2024 vote post`, !!!turnout_scale),
      vote2024pre = dplyr::recode(`Vote likelihood`, !!!turnout_scale)
    )
})

###CODE TO ADD SURVEY WEIGHTS BELOW###
# Common variables. 
# Columns printed when inspecting the respondents with extreme weights.
the.vars <- c("ageXeduc", "raceXethnic", "gender")

# Population targets for raking, one data frame per margin, in the order
# age-by-education, race-by-ethnicity, gender. Each data frame pairs a category
# label with its target count `Freq`: the population share scaled to the number
# of respondents. Shares are matched to labels by position.
pop.targs <- local({
  n_respondents <- nrow(DF)

  # Age band crossed with education group (1-4); age varies fastest.
  age_educ_labels <- c(
    "18-24 & 1", "25-44 & 1", "45-64 & 1", "65-105 & 1",
    "18-24 & 2", "25-44 & 2", "45-64 & 2", "65-105 & 2",
    "18-24 & 3", "25-44 & 3", "45-64 & 3", "65-105 & 3",
    "18-24 & 4", "25-44 & 4", "45-64 & 4", "65-105 & 4"
  )
  age_educ_shares <- c(
    0.054578084, 0.102256133, 0.128921114, 0.09780378,
    0.05757399, 0.106680626, 0.105207149, 0.053504445,
    0.012638391, 0.078148602, 0.065091858, 0.030753835,
    0.000886397, 0.038932675, 0.041434738, 0.025588184
  )

  race_ethnic_labels <- c(
    "white & hisp", "black & hisp", "other & hisp",
    "white & not hisp", "black & not hisp", "other & not hisp"
  )
  race_ethnic_shares <- c(
    0.08151393, 0.002810298, 0.03612324,
    0.685729959, 0.12433179, 0.06949078
  )

  gender_labels <- c("male", "female")
  gender_shares <- c(0.484736836, 0.515263164)

  list(
    data.frame(ageXeduc = age_educ_labels, Freq = n_respondents * age_educ_shares),
    data.frame(raceXethnic = race_ethnic_labels, Freq = n_respondents * race_ethnic_shares),
    data.frame(gender_cat = gender_labels, Freq = n_respondents * gender_shares)
  )
})


#Set non-substantive answers ("None of the above", "Prefer not to answer") to missing so they are imputed below
# Switch to data.table for the weighting steps, then blank out the codes that
# are not substantive answers so the imputation below treats them as missing:
#   education "-3105" = "None of the above"
#   ethnicity 16      = "Prefer not to answer"
#   hispanic  15      = "Prefer not to answer"
# `:=` updates the matching rows in place; the previous `DF[cond]$col <- NA`
# form copied the table through `[<-`/`$<-` to reach the same result.
DF <- as.data.table(DF)
DF[education == "-3105", education := NA]
DF[ethnicity == 16, ethnicity := NA]
DF[hispanic == 15, hispanic := NA]

library(car)

# =================
# Age category.
# Fill in missing values.
# Impute missing ages by drawing, with replacement, from the observed ages.
# The seed is fixed so the imputation is reproducible.
set.seed(20181003)
DF[is.na(age), age := {
  observed_age <- DF[!is.na(age), age]
  # Draw positions with sample.int() rather than calling sample() on the values:
  # sample(x, ...) treats a single number x as 1:x. With two or more observed
  # values the draws are the same as sample() would give.
  observed_age[sample.int(length(observed_age), .N, replace = TRUE)]
}]
# Recode to the four age bands used by the raking targets. car::recode is
# called explicitly because the spec-string syntax is car's, not dplyr's.
# The top band runs to 105 to match its "65-105" label (it previously stopped
# at 99, leaving ages 100-105 uncoded).
DF[, age_cat := car::recode(age, "18:24='18-24';25:44='25-44';45:64='45-64';65:105='65-105'")]
# Diagnostics: cross-tabulate age against its band and check for missing bands.
DF[, table(age, age_cat)]
any(is.na(DF$age_cat))

# =================
# Educ.
# Fill in missing values.
# Impute missing education codes by drawing, with replacement, from the
# observed codes. The seed is fixed so the imputation is reproducible.
set.seed(20181003)
DF[is.na(education), education := {
  observed_educ <- DF[!is.na(education), education]
  # Draw positions with sample.int(); for two or more observed values this
  # gives the same draws as sample() and stays correct with a single value.
  observed_educ[sample.int(length(observed_educ), .N, replace = TRUE)]
}]
# Recode to four categories: codes 1-3 -> 1, 4-5 -> 2, 6 -> 3, 7-8 -> 4.
# car::recode is called explicitly because the spec-string syntax is car's.
DF[, educ_cat := car::recode(education, "1:3=1;4:5=2;6=3;7:8=4")]
any(is.na(DF$educ_cat))

# =================
# AgeXEduc.
# Combine the age band and education category into one "<age> & <educ>" label,
# the format used by the age-by-education raking targets.
DF[, ageXeduc := paste(age_cat, educ_cat, sep = " & ")]
any(is.na(DF[["ageXeduc"]]))

#==================
#Race
# Fill in missing values.
# Impute missing race codes by drawing, with replacement, from the observed
# codes. The seed is fixed so the imputation is reproducible.
set.seed(20181003)
DF[is.na(ethnicity), ethnicity := {
  observed_race <- DF[!is.na(ethnicity), ethnicity]
  # Draw positions with sample.int(); for two or more observed values this
  # gives the same draws as sample() and stays correct with a single value.
  observed_race[sample.int(length(observed_race), .N, replace = TRUE)]
}]
any(is.na(DF$ethnicity))
# Recode to 3 values: 1 -> white, 2 -> black, every other code (3-15) -> other.
# car::recode is called explicitly because the spec-string syntax is car's.
# NOTE(review): codes 3-10 are listed one by one rather than as a range. If
# `ethnicity` is stored as text, a range such as 3:15 would presumably be
# compared as strings and miss codes - confirm before collapsing the spec.
DF[, race_cat := car::recode(ethnicity, "1='white';2='black';4='other';5='other';6='other';7='other';
                       8='other';9='other';10='other';11:15='other';3='other'")]
any(is.na(DF$race_cat))

#==================
#Ethnicity
# Fill in missing values.
# Impute missing Hispanic-origin codes by drawing, with replacement, from the
# observed codes. The seed is fixed so the imputation is reproducible.
set.seed(20181003)
DF[is.na(hispanic), hispanic := {
  observed_hisp <- DF[!is.na(hispanic), hispanic]
  # Draw positions with sample.int(); for two or more observed values this
  # gives the same draws as sample() and stays correct with a single value.
  observed_hisp[sample.int(length(observed_hisp), .N, replace = TRUE)]
}]
any(is.na(DF$hispanic))
# Recode to 2 values: 1 -> not hisp, every other code (2-16) -> hisp.
# car::recode is called explicitly because the spec-string syntax is car's.
DF[, ethnic_cat := car::recode(hispanic, "1='not hisp';2='hisp';3='hisp';4='hisp';5='hisp';6='hisp';
                         7='hisp';8='hisp';9='hisp';10='hisp';11='hisp';12='hisp';13='hisp';
                         14='hisp';15='hisp';16='hisp'")]
any(is.na(DF$ethnic_cat))

# =================
# RaceXEthnic
# Combine race and ethnicity into one "<race> & <ethnicity>" label, the format
# used by the race-by-ethnicity raking targets.
DF[, raceXethnic := paste(race_cat, ethnic_cat, sep = " & ")]
any(is.na(DF[["raceXethnic"]]))


#==================
#Gender
# Fill in missing values.
# Impute missing gender codes by drawing, with replacement, from the observed
# codes. The seed is fixed so the imputation is reproducible.
set.seed(20181003)
DF[is.na(gender), gender := {
  observed_gender <- DF[!is.na(gender), gender]
  # Draw positions with sample.int(); for two or more observed values this
  # gives the same draws as sample() and stays correct with a single value.
  observed_gender[sample.int(length(observed_gender), .N, replace = TRUE)]
}]
any(is.na(DF$gender))
# Label the two gender codes used by the raking targets. car::recode is called
# explicitly because the spec-string syntax is car's.
DF[, gender_cat := car::recode(gender, "1='male';2='female'")]
any(is.na(DF$gender_cat))


# Final check that none of the three raking variables has missing values.
any(is.na(DF$ageXeduc))
any(is.na(DF$raceXethnic))
any(is.na(DF$gender_cat))

#
# =================
# Create weights via raking.
# =================
#

library(survey)
# Create survey design object for Lucid data.
# Unclustered design (ids = ~1) with no weights supplied; raking below provides
# the weights.
DF.svy <- svydesign(
  ids = ~1,
  data = DF,
  weights = NULL
)

library(survey)
# Rake on the three margins. The formulas in `sample.margins` must be in the
# same order as the target data frames in `pop.targs`.
DF.svy.rk <- rake(
  design = DF.svy,
  sample.margins = list(~ageXeduc, ~raceXethnic, ~gender_cat),
  population.margins = pop.targs
)


# Present individuals with largest and smallest weights.
# Report the raked weights before any trimming: summary, deciles, and the
# respondents holding the largest and smallest weight.
cat("Summary of distribution of weights before trimming:\n")
the.wts <- weights(DF.svy.rk)
print(summary(the.wts))
print(quantile(the.wts, probs = seq(.1, .9, .1)))
cat("Observations with max and min weights assigned. \nMax:\n")
print(DF[the.wts == max(the.wts), the.vars, with = FALSE])
cat("Min:\n")
print(DF[the.wts == min(the.wts), the.vars, with = FALSE])

# Trim the weights to [1/8, 8] when any weight falls outside that range.
# The lower bound is checked as well as the upper one: trimWeights() is given
# lower = 1/8, but the trim was previously triggered only by weights above 8.
# After this block `DF.svy.rk2` is the final design and `the.wts` holds its
# weights, whether or not trimming happened.
if (max(the.wts) > 8 || min(the.wts) < 1/8) {
  DF.svy.rk2 <- trimWeights(DF.svy.rk, lower = 1/8, upper = 8, strict = TRUE)
  the.wts <- weights(DF.svy.rk2)
  cat("Summary of distribution of weights after trimming:\n")
  print(summary(the.wts))
  cat("Observations with max and min weights assigned.\nMax:\n")
  print(DF[the.wts == max(the.wts), the.vars, with = FALSE])
  cat("Min:\n")
  print(DF[the.wts == min(the.wts), the.vars, with = FALSE])
} else {
  DF.svy.rk2 <- DF.svy.rk
}

# Check.
#y <- as.vector(svytable(~ageXeduc,DF.svy.rk))
#x <- as.vector(svytable(~ageXeduc,DF.svy))
#plot(x=x, y=y, xlim=range(c(y,x)), ylim=range(c(y,x)), xlab="Original ageXeduc counts",ylab="Raked ageXeduc counts",log="xy")

# Save weights to csv along with resp.id.
# Export the recoded data with the final weights appended as column `the.wts`.
# The seven helper columns created for raking are dropped by name rather than
# by position, so the export does not depend on them being the last seven
# columns of DF.
# Note: the retained raw columns age, education, ethnicity, hispanic and gender
# hold the values after the NA-setting and imputation steps above.
df <- cbind(
  DF[, setdiff(names(DF),
               c("age_cat", "educ_cat", "ageXeduc", "race_cat",
                 "ethnic_cat", "raceXethnic", "gender_cat")),
     with = FALSE],
  the.wts
)
write.csv(df, "Survey Data/National Recoded.csv")


